library("tidyverse")
library("grid")
library("gridExtra")
library("scales")
data(Yamaguchi87, package="vcdExtra")
Yamaguchi87 <- tbl_df(Yamaguchi87)
grid_arrange_shared_legend <- function(..., ncol = length(list(...)), nrow = 1, position = c("bottom", "right")) {
plots <- list(...)
position <- match.arg(position)
g <- ggplotGrob(plots[[1]] + theme(legend.position = position))$grobs
legend <- g[[which(sapply(g, function(x) x$name) == "guide-box")]]
lheight <- sum(legend$height)
lwidth <- sum(legend$width)
gl <- lapply(plots, function(x) x + theme(legend.position="none"))
gl <- c(gl, ncol = ncol, nrow = nrow)
combined <- switch(position,
"bottom" = arrangeGrob(do.call(arrangeGrob, gl),
legend,
ncol = 1,
heights = unit.c(unit(1, "npc") - lheight, lheight)),
"right" = arrangeGrob(do.call(arrangeGrob, gl),
legend,
ncol = 2,
widths = unit.c(unit(1, "npc") - lwidth, lwidth)))
grid.newpage()
grid.draw(combined)
}
p45a <- Yamaguchi87 %>%
select(Country, Son, Freq) %>%
group_by(Country, Son) %>%
summarise(Freq = sum(Freq)) %>%
mutate(Percentage=Freq/sum(Freq)) %>%
mutate(Occupation=Son) %>%
ggplot(aes(x=Occupation, y=Percentage, fill=Occupation)) +
geom_bar(stat="identity") +
facet_grid(. ~ Country) +
ggtitle("(a) Distributions of Sons' Occupations in Three Countries") +
xlab("") + ylab("") +
scale_y_continuous(labels=percent) +
scale_fill_brewer(palette="RdPu") +
theme_linedraw()
p45b <- Yamaguchi87 %>%
filter(Country=="UK") %>%
select(-Country) %>%
gather(Generation, Occupation, -Freq) %>%
group_by(Generation, Occupation) %>%
summarise(Freq = sum(Freq)) %>%
mutate(Percentage=Freq/sum(Freq)) %>%
mutate(Occupation=factor(Occupation, levels = c("UpNM", "LoNM", "UpM", "LoM", "Farm"))) %>%
ggplot(aes(x=Occupation, y=Percentage, fill=Occupation)) +
geom_bar(stat="identity") +
facet_grid(. ~ Generation) +
ggtitle("(b) Distributions of Fathers' and Sons' occupations in UK") +
xlab("") + ylab("") +
scale_y_continuous(labels=percent) +
scale_fill_brewer(palette="RdPu") +
theme_linedraw()
grid_arrange_shared_legend(p45a, p45b, ncol = 1, nrow = 2)
US and UK have similar distribution among occupations, however Japan is different than them. We can see the percentage of each occupation in Japan is more balanced.
They are similar, but the percentage of Up Mon-Manual increased and the percentage of Farm decreased from Father to Son.
The results are what I would have expected. Data of these three countries were derived from surveys in early 1970s. At that time, UK and US are more developed and industrialized than Japan, so thy have more people work in non-agriculture area as what we see in Figure (a). With the growth of economy and technology, machines replaced more and more human workers, so the percentage of people who work in farm and low manual area decresed and the percentage of people working in non-manual area increased, which match what we see in Figure (b).
data(Scotch, package="bayesm")
Scotch <- tbl_df(Scotch)
Scotch %>%
gather(Brand, Count) %>%
filter(Brand!='Other.Brands') %>%
group_by(Brand) %>%
summarise(Consumption = sum(Count)) %>%
ggplot(aes(x=reorder(Brand, Consumption), y=Consumption, fill=Consumption)) +
geom_bar(stat="identity") +
coord_flip() +
xlab("Brand") +
ggtitle("(a) Consumption per Brand I") +
scale_fill_gradient(low="blue", high="red") +
theme_linedraw()
According to consumption data showing in Graph (a), Chivas.Regal is the best brand.
Since there’s a obvious gap between Glenfiddich and Pinch..Haig. on the graph and the number of consumption for these two are 334 and 117, I would pick a number in between, let’s say 200 as the cutoff for big brand.
data(whiskey, package="flexmix")
w1 <- tbl_df(data.frame(whiskey$Incidence * whiskey$Freq)) %>%
gather(Brand, Count) %>%
group_by(Brand) %>%
summarise(Consumption = sum(Count))
w2 <- tbl_df(whiskey_brands) %>%
mutate(Brand=gsub("([^a-zA-Z])", ".", Brand))
whiskey_all <- left_join(w1, w2, by=c("Brand" = "Brand"))
whiskey_all %>%
filter(Brand!='Other.brands') %>%
ggplot(aes(x=Consumption, y=reorder(Brand, Consumption))) +
geom_segment(aes(yend=Brand), xend=0, color='grey50') +
geom_point(size=3, aes(color=Bottled)) +
facet_grid(Type ~ ., scales='free_y', space='free_y') +
ylab("Brand") +
ggtitle("(c) Consumption per Brand II") +
scale_color_manual(values = c("royalblue1", "violetred1")) +
theme_linedraw()
Bottled category, we see that most of the Brands are coming from outside United States, so I think “whisky” is better.library("GGally")
Attaching package: ‘GGally’
The following object is masked from ‘package:dplyr’:
nasa
data(olives, package="extracat")
olives <- tbl_df(olives)
olives %>%
ggpairs(columns=3:10,
ggplot2::aes(color="orangered1"),
lower=list(continuous=wrap("points", alpha=0.3, size=0.1)),
diag=list(continuous=wrap("densityDiag", alpha=0.7, size=0.1), axisLabels='none'),
upper=list(continuous=wrap("cor", size=rel(3)))
) +
theme_linedraw() +
theme(
text=element_text(size = 7),
axis.text=element_text(size = 6),
axis.text.x=element_text(angle = 60))
plot: [1,1] [=---------------------------------------------------] 2% est: 0s
plot: [1,2] [==--------------------------------------------------] 3% est: 5s
plot: [1,3] [==--------------------------------------------------] 5% est: 5s
plot: [1,4] [===-------------------------------------------------] 6% est: 5s
plot: [1,5] [====------------------------------------------------] 8% est: 5s
plot: [1,6] [=====-----------------------------------------------] 9% est: 5s
plot: [1,7] [======----------------------------------------------] 11% est: 5s
plot: [1,8] [======----------------------------------------------] 12% est: 5s
plot: [2,1] [=======---------------------------------------------] 14% est: 5s
plot: [2,2] [========--------------------------------------------] 16% est: 5s
plot: [2,3] [=========-------------------------------------------] 17% est: 5s
plot: [2,4] [==========------------------------------------------] 19% est: 5s
plot: [2,5] [===========-----------------------------------------] 20% est: 5s
plot: [2,6] [===========-----------------------------------------] 22% est: 4s
plot: [2,7] [============----------------------------------------] 23% est: 4s
plot: [2,8] [=============---------------------------------------] 25% est: 4s
plot: [3,1] [==============--------------------------------------] 27% est: 4s
plot: [3,2] [===============-------------------------------------] 28% est: 4s
plot: [3,3] [===============-------------------------------------] 30% est: 4s
plot: [3,4] [================------------------------------------] 31% est: 4s
plot: [3,5] [=================-----------------------------------] 33% est: 4s
plot: [3,6] [==================----------------------------------] 34% est: 4s
plot: [3,7] [===================---------------------------------] 36% est: 4s
plot: [3,8] [====================--------------------------------] 38% est: 4s
plot: [4,1] [====================--------------------------------] 39% est: 4s
plot: [4,2] [=====================-------------------------------] 41% est: 4s
plot: [4,3] [======================------------------------------] 42% est: 4s
plot: [4,4] [=======================-----------------------------] 44% est: 4s
plot: [4,5] [========================----------------------------] 45% est: 3s
plot: [4,6] [========================----------------------------] 47% est: 3s
plot: [4,7] [=========================---------------------------] 48% est: 3s
plot: [4,8] [==========================--------------------------] 50% est: 3s
plot: [5,1] [===========================-------------------------] 52% est: 3s
plot: [5,2] [============================------------------------] 53% est: 3s
plot: [5,3] [============================------------------------] 55% est: 3s
plot: [5,4] [=============================-----------------------] 56% est: 3s
plot: [5,5] [==============================----------------------] 58% est: 3s
plot: [5,6] [===============================---------------------] 59% est: 3s
plot: [5,7] [================================--------------------] 61% est: 3s
plot: [5,8] [================================--------------------] 62% est: 2s
plot: [6,1] [=================================-------------------] 64% est: 2s
plot: [6,2] [==================================------------------] 66% est: 2s
plot: [6,3] [===================================-----------------] 67% est: 2s
plot: [6,4] [====================================----------------] 69% est: 2s
plot: [6,5] [=====================================---------------] 70% est: 2s
plot: [6,6] [=====================================---------------] 72% est: 2s
plot: [6,7] [======================================--------------] 73% est: 2s
plot: [6,8] [=======================================-------------] 75% est: 2s
plot: [7,1] [========================================------------] 77% est: 2s
plot: [7,2] [=========================================-----------] 78% est: 2s
plot: [7,3] [=========================================-----------] 80% est: 1s
plot: [7,4] [==========================================----------] 81% est: 1s
plot: [7,5] [===========================================---------] 83% est: 1s
plot: [7,6] [============================================--------] 84% est: 1s
plot: [7,7] [=============================================-------] 86% est: 1s
plot: [7,8] [==============================================------] 88% est: 1s
plot: [8,1] [==============================================------] 89% est: 1s
plot: [8,2] [===============================================-----] 91% est: 1s
plot: [8,3] [================================================----] 92% est: 1s
plot: [8,4] [=================================================---] 94% est: 0s
plot: [8,5] [==================================================--] 95% est: 0s
plot: [8,6] [==================================================--] 97% est: 0s
plot: [8,7] [===================================================-] 98% est: 0s
plot: [8,8] [====================================================]100% est: 0s
From the plot above we can tell Palmitoleic and Palmitic are strongly positively associated. Oleic and Palmitic, Oleic and Palmitoleic are strongly negatively associated.
Yes. All the scatter plots of Eicosenoic with others have many outliers stay at the bottom, and look like some line shape.
data("HRstars", package="GDAdata")
HRstars <- tbl_df(HRstars)
HRstars %>%
ggplot(aes(x=BV, y=V, color=BV)) +
geom_point(alpha=.3, size=.7) +
scale_color_gradient(low="blue", high="red") +
xlab("Color (BV)") + ylab("Absolute Maganitude (V)") +
scale_y_reverse() +
theme_linedraw()
Compare with the graph from Wikipedia, they are similar. Graph drawn from HRstars clearly shows White Dwarfs, V Main Sequence and III Giants. But it is probabaly due to the small dataset we have, it is not as clear as the one on Wikipedia, and lacks of details.
Compare to my graph, it seems like the graph from Wiki contains more data with BV = 0.
I colored dots by variable BV. And from the graph I made, it shows a very obvious trend from left top to right bottom.
?HRstars
No documentation for ‘HRstars’ in specified packages and libraries:
you could try ‘??HRstars’
data("HRstars", package="GDAdata")
HRstars <- tbl_df(HRstars)
data("starsCYG", package="robustbase")
starsCYG <- tbl_df(starsCYG)
glimpse(starsCYG)
Observations: 47
Variables: 2
$ log.Te <dbl> 4.37, 4.56, 4.26, 4.56, 4.30, 4.46, 3.84, 4.57, 4.26, 4.37, 3.49...
$ log.light <dbl> 5.23, 5.74, 4.93, 5.74, 5.19, 5.46, 4.65, 5.27, 5.57, 5.12, 5.73...
?starsCYG
No documentation for ‘starsCYG’ in specified packages and libraries:
you could try ‘??starsCYG’
data(bodyfat, package="MMST")
bodyfat <- tbl_df(bodyfat)
bodyfat %>%
ggparcoord(alphaLines=0.2, scale="uniminmax", groupColumn="age") +
xlab("") + ylab("") +
# scale_color_gradient(low="blue", high="red") +
theme_linedraw() +
theme(axis.text.x=element_text(angle = 15))
Yes, there are outliers. Individual outliers can be seen on most of the variables (weight, neck, chest, abdomen, hip, thign, knee, ankle and biceps). Outliers here are usually extreme values.
Height has many small subgroups and some positive correlation with weight and .
density and bodyfat are strongly negative correlated.
Yes. Because if we put the first two variables far away from each other, it will be impossible to see the negative correlation. After reordering the variables, I got a graph belowe, this new graph does a better job on showing correlations.
# 1 density
# 2 "bodyfat"
# 3 "age"
# 4 "weight"
# 5 "height"
# 6 "neck"
# 7 "chest"
# 8 "abdomen"
# 9 "hip"
# 10 "thigh"
# 11 "knee"
# 12 "ankle"
# 13 "biceps"
# 14 "forearm"
# 15 "wrist"
bodyfat %>%
ggparcoord(alphaLines=0.2, scale="uniminmax",
order=c(2,15,11,3,10,1,4,5,9,13,12,14,8,6,7),
groupColumn="age") +
xlab("") + ylab("") +
# scale_colour_manual(values = c("#00BFC4")) +
theme_linedraw() +
theme(axis.text.x=element_text(angle = 15))
data(wine, package="MMST")
wine_mmst <- tbl_df(wine)
data(wine, package="pgmm")
wine_pgmm <- tbl_df(wine)
wine_pgmm <- wine_pgmm %>%
mutate_all(funs(as.numeric)) %>%
mutate_at(vars(Type), funs(as.factor))
wine_classname <- wine_mmst %>%
select(Class=class, classdigit) %>%
distinct()
wine_all <- left_join(wine_pgmm, wine_classname, by=c("Type" = "classdigit"))
wine_all %>%
ggparcoord(columns=2:28, groupColumn="Class", alphaLines=0.2, scale="uniminmax", order="anyClass") +
xlab("") + ylab("") +
coord_flip() +
scale_colour_manual(values = c("royalblue1", "black", "violetred1")) +
# scale_color_brewer(palette="Dark2") +
theme_linedraw()
From the pcp, we can clearly see that, Flavanoids seems to be able to separate these three clases. Proline, OD280/OD315 of Diluted Wines, OD280/OD315 of Flavanoids, Hue, Alcohol maybe helpful to separate one class from the other two.
Yes, there are outliers. From the graph we can find that Flavanoids has one on the right clearly, as well as Chloride.
wine_all %>%
ggparcoord(columns=2:28, alphaLines=0.2, scale="uniminmax", groupColumn="Class") +
xlab("") + ylab("") +
ggtitle("") +
facet_grid(~Class) +
coord_flip() +
scale_colour_manual(values = c("royalblue1", "black", "violetred1")) +
theme_linedraw() +
theme(panel.spacing.x=unit(0.8, "lines"))
Color Intensity in class Barbera, Malic Acid in class Barolo and Total Phenols in class Grignolinoolives %>%
ggpairs(columns=3:10,
lower=list(continuous=wrap("points", alpha=0.3, size=0.1)),
diag=list(continuous=wrap("densityDiag", alpha=0.7, size=0.1), axisLabels='none'),
upper=list(continuous=wrap("cor", size=1.8)),
ggplot2::aes(colour=Region)
) +
theme_linedraw() +
theme(
text=element_text(size = 7),
axis.text=element_text(size = 6),
axis.text.x=element_text(angle = 90))
plot: [1,1] [=---------------------------------------------------] 2% est: 0s
plot: [1,2] [==--------------------------------------------------] 3% est: 6s
plot: [1,3] [==--------------------------------------------------] 5% est: 7s
plot: [1,4] [===-------------------------------------------------] 6% est: 7s
plot: [1,5] [====------------------------------------------------] 8% est: 7s
plot: [1,6] [=====-----------------------------------------------] 9% est: 7s
plot: [1,7] [======----------------------------------------------] 11% est: 7s
plot: [1,8] [======----------------------------------------------] 12% est: 7s
plot: [2,1] [=======---------------------------------------------] 14% est: 7s
plot: [2,2] [========--------------------------------------------] 16% est: 7s
plot: [2,3] [=========-------------------------------------------] 17% est: 7s
plot: [2,4] [==========------------------------------------------] 19% est: 7s
plot: [2,5] [===========-----------------------------------------] 20% est: 7s
plot: [2,6] [===========-----------------------------------------] 22% est: 7s
plot: [2,7] [============----------------------------------------] 23% est: 7s
plot: [2,8] [=============---------------------------------------] 25% est: 7s
plot: [3,1] [==============--------------------------------------] 27% est: 7s
plot: [3,2] [===============-------------------------------------] 28% est: 7s
plot: [3,3] [===============-------------------------------------] 30% est: 6s
plot: [3,4] [================------------------------------------] 31% est: 6s
plot: [3,5] [=================-----------------------------------] 33% est: 6s
plot: [3,6] [==================----------------------------------] 34% est: 6s
plot: [3,7] [===================---------------------------------] 36% est: 6s
plot: [3,8] [====================--------------------------------] 38% est: 6s
plot: [4,1] [====================--------------------------------] 39% est: 6s
plot: [4,2] [=====================-------------------------------] 41% est: 6s
plot: [4,3] [======================------------------------------] 42% est: 6s
plot: [4,4] [=======================-----------------------------] 44% est: 5s
plot: [4,5] [========================----------------------------] 45% est: 5s
plot: [4,6] [========================----------------------------] 47% est: 5s
plot: [4,7] [=========================---------------------------] 48% est: 5s
plot: [4,8] [==========================--------------------------] 50% est: 5s
plot: [5,1] [===========================-------------------------] 52% est: 5s
plot: [5,2] [============================------------------------] 53% est: 4s
plot: [5,3] [============================------------------------] 55% est: 4s
plot: [5,4] [=============================-----------------------] 56% est: 4s
plot: [5,5] [==============================----------------------] 58% est: 4s
plot: [5,6] [===============================---------------------] 59% est: 4s
plot: [5,7] [================================--------------------] 61% est: 4s
plot: [5,8] [================================--------------------] 62% est: 3s
plot: [6,1] [=================================-------------------] 64% est: 3s
plot: [6,2] [==================================------------------] 66% est: 3s
plot: [6,3] [===================================-----------------] 67% est: 3s
plot: [6,4] [====================================----------------] 69% est: 3s
plot: [6,5] [=====================================---------------] 70% est: 3s
plot: [6,6] [=====================================---------------] 72% est: 3s
plot: [6,7] [======================================--------------] 73% est: 2s
plot: [6,8] [=======================================-------------] 75% est: 2s
plot: [7,1] [========================================------------] 77% est: 2s
plot: [7,2] [=========================================-----------] 78% est: 2s
plot: [7,3] [=========================================-----------] 80% est: 2s
plot: [7,4] [==========================================----------] 81% est: 2s
plot: [7,5] [===========================================---------] 83% est: 2s
plot: [7,6] [============================================--------] 84% est: 1s
plot: [7,7] [=============================================-------] 86% est: 1s
plot: [7,8] [==============================================------] 88% est: 1s
plot: [8,1] [==============================================------] 89% est: 1s
plot: [8,2] [===============================================-----] 91% est: 1s
plot: [8,3] [================================================----] 92% est: 1s
plot: [8,4] [=================================================---] 94% est: 1s
plot: [8,5] [==================================================--] 95% est: 0s
plot: [8,6] [==================================================--] 97% est: 0s
plot: [8,7] [===================================================-] 98% est: 0s
plot: [8,8] [====================================================]100% est: 0s
?ggpairs
By coloring the scatter plot by Region, it is very clear that there’s some special pattern for feature eicosenoic. From the last row we can tell that all red and green dotc (North and Sardinia region) stay at the bottom.
bodyfat %>%
ggcorr(label = TRUE, label_size = 3, label_round = 2, label_alpha = TRUE, low = "royalblue1", mid = "#EEEEEE", high = "violetred1")
?ggcorr